import requests
from bs4 import BeautifulSoup
import pymysql
class DBTool():
    """Small helper around one pymysql connection for the ``paper`` table.

    The instance owns a connection (``self.conn``) and a single cursor
    (``self.cursor``).  Call :meth:`close` when done, or use the instance
    as a context manager (``with DBTool() as db: ...``).
    """

    # NOTE(review): the default credentials are kept only for backward
    # compatibility with existing ``DBTool()`` callers; prefer passing them
    # in from configuration instead of keeping them in source.
    def __init__(self, host='localhost', database="paper", user='gao',
                 password='347210', charset='utf8'):
        """Connect to MySQL and create the cursor used by every method.

        Args:
            host: Server host name.
            database: Schema to use.
            user: Login name.
            password: Login password.
            charset: Connection character set.
        """
        self.conn = pymysql.connect(host=host, database=database, user=user,
                                    password=password, charset=charset)
        self.cursor = self.conn.cursor()

    def __enter__(self):
        """Return the tool itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the cursor and connection; never suppresses exceptions."""
        self.close()
        return False

    def close(self):
        """Release the cursor and then the connection."""
        try:
            self.cursor.close()
        finally:
            # Close the connection even if closing the cursor failed.
            self.conn.close()

    def insert(self, title, author, abstracts, pdf_url, pub_date):
        """Insert one row into ``paper`` and commit it.

        Values are passed as query parameters, not formatted into the SQL.

        Returns:
            True when the row was committed; False when the statement or the
            commit failed (the error is printed and the transaction is
            rolled back).
        """
        try:
            self.cursor.execute(
                'INSERT INTO paper (title, author, abstracts, pdf_url, pub_date) VALUES (%s, %s, %s, %s, %s)',
                (title, author, abstracts, pdf_url, pub_date))
            self.conn.commit()
        except Exception as e:
            print(e)
            # Undo the failed transaction so later statements on this
            # connection start from a clean state.
            try:
                self.conn.rollback()
            except Exception as rollback_error:
                # Keep the "return False on failure" contract even if the
                # connection is too broken to roll back.
                print(rollback_error)
            return False
        return True

    def queryOne(self, title):
        """Return the first row whose ``title`` equals *title*, or None."""
        self.cursor.execute('SELECT * FROM paper WHERE title=%s', (title,))
        return self.cursor.fetchone()

    def queryAll(self):
        """Return every row of the ``paper`` table."""
        self.cursor.execute('SELECT * FROM paper')
        return self.cursor.fetchall()
def scrawler(url, selector, flag, timeout=10):
    """Download *url* and extract data from the elements matching *selector*.

    Args:
        url: Page to fetch with an HTTP GET.
        selector: CSS selector handed to ``BeautifulSoup.select``.
        flag: ``0`` to collect the text of each matched element, ``1`` to
            collect the ``href`` attribute of each matched element.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of strings for ``flag`` 0 or 1 (elements without an ``href``
        are skipped for ``flag`` 1); ``None`` for any other ``flag``.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'}
    # Without a timeout requests can wait forever on an unresponsive server.
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were the article.
    r.raise_for_status()
    # When the server declares no charset, requests falls back to ISO-8859-1
    # for text responses, which garbles non-Latin pages; use the encoding
    # detected from the body instead.
    if 'charset' not in r.headers.get('Content-Type', '').lower():
        r.encoding = r.apparent_encoding
    bs = BeautifulSoup(r.text, 'html.parser')
    items = bs.select(selector=selector)
    if flag == 0:
        return [item.text for item in items]
    if flag == 1:
        # Skip matches that carry no href rather than raising KeyError.
        return [item.attrs['href'] for item in items if 'href' in item.attrs]
    # Unknown flag: keep the original contract of returning None.
    return None
def _first_or_empty(values):
    """Return the first element of *values*, or '' when there is none."""
    return values[0] if values else ''


def main():
    """Scrape one article page and print its metadata fields.

    Opens the database connection first (as the script always did), prints
    title, author, abstract, PDF link and publication date, and closes the
    connection before returning, even if scraping fails.
    """
    url = "https://cjb.ijournals.cn/cjbcn/article/abstract/gc20081556?st=article_issue"
    db = DBTool()
    try:
        # NOTE: every scrawler() call issues its own HTTP GET for the same URL.
        title = _first_or_empty(scrawler(url, "div.zh > div.title", flag=0))
        # Only the author field has its surrounding whitespace stripped.
        author = _first_or_empty(scrawler(url, "#cp-cont", flag=0)).strip()
        abstracts = _first_or_empty(scrawler(url, "#CnAbstractValue", flag=0))
        pdf_url = _first_or_empty(scrawler(url, "#PdfUrl", flag=1))
        pub_date = _first_or_empty(scrawler(url, "#PublishTimeValue", flag=0))
        for field in (title, author, abstracts, pdf_url, pub_date):
            print(field)
        # Persisting the record is currently disabled:
        # if db.insert(title, author, abstracts, pdf_url, pub_date):
        #     print(db.queryAll())
    finally:
        # Release the cursor and connection opened by DBTool().
        try:
            db.cursor.close()
        finally:
            db.conn.close()


if __name__ == '__main__':
    main()